Process Breast Cancer RNASeq Data

geneExp_T=read.csv(file = "../TCGA_BRCA_Tumor_RSEM_GeneExp.tsv", header = T, sep = "\t", as.is = T, row.names = NULL)

geneExp_T[1:5,1:5]
##   row.names TCGA.BH.A0W3.01A.11R.A109.07 TCGA.A8.A07F.01A.11R.A00Z.07
## 1    TSPAN6                    4.4098524                  11.52828776
## 2      TNMD                    0.1194379                   0.04438564
## 3      DPM1                   29.6726970                  28.00414215
## 4     SCYL3                    6.5723377                   6.70692039
## 5  C1orf112                    2.8762074                   4.81435840
##   TCGA.GM.A3XL.01A.11R.A22U.07 TCGA.LL.A441.01A.11R.A24H.07
## 1                   6.27288098                   17.8241862
## 2                   0.04878317                    0.6663316
## 3                  57.36674446                   56.1109712
## 4                   3.08093040                    7.1605879
## 5                   4.84118390                    2.9625904
geneExp_T_m=as.matrix(geneExp_T[,2:ncol(geneExp_T)])
rownames(geneExp_T_m)=geneExp_T$row.names

#Now it is a matrix format
geneExp_T_m[1:5,1:5]
##          TCGA.BH.A0W3.01A.11R.A109.07 TCGA.A8.A07F.01A.11R.A00Z.07
## TSPAN6                      4.4098524                  11.52828776
## TNMD                        0.1194379                   0.04438564
## DPM1                       29.6726970                  28.00414215
## SCYL3                       6.5723377                   6.70692039
## C1orf112                    2.8762074                   4.81435840
##          TCGA.GM.A3XL.01A.11R.A22U.07 TCGA.LL.A441.01A.11R.A24H.07
## TSPAN6                     6.27288098                   17.8241862
## TNMD                       0.04878317                    0.6663316
## DPM1                      57.36674446                   56.1109712
## SCYL3                      3.08093040                    7.1605879
## C1orf112                   4.84118390                    2.9625904
##          TCGA.BH.A0HP.01A.12R.A084.07
## TSPAN6                      23.008325
## TNMD                         0.528638
## DPM1                        24.222269
## SCYL3                        6.241249
## C1orf112                     2.429812
#Size of matrix
dim(geneExp_T_m)
## [1] 56499  1102
#Filter out Protein-coding genes
ENSEMBL_annotations=read.csv("Homo_sapiens.GRCh37.87_Annotations.txt",
                             header=FALSE,sep=" ",stringsAsFactors = F)
ENSEMBL_annotations_prot=unique(ENSEMBL_annotations[ENSEMBL_annotations$V3=="protein_coding",])

geneExp_T_m_prot=subset(geneExp_T_m,rownames(geneExp_T_m) %in% ENSEMBL_annotations_prot$V2)
#Now we have gene matrix (rows) consisting of protein-coding ONLY
dim(geneExp_T_m_prot)
## [1] 18390  1102
head(geneExp_T_m_prot)[,1:5]
##          TCGA.BH.A0W3.01A.11R.A109.07 TCGA.A8.A07F.01A.11R.A00Z.07
## TSPAN6                      4.4098524                  11.52828776
## TNMD                        0.1194379                   0.04438564
## DPM1                       29.6726970                  28.00414215
## SCYL3                       6.5723377                   6.70692039
## C1orf112                    2.8762074                   4.81435840
## FGR                         1.7643648                   1.36997556
##          TCGA.GM.A3XL.01A.11R.A22U.07 TCGA.LL.A441.01A.11R.A24H.07
## TSPAN6                     6.27288098                   17.8241862
## TNMD                       0.04878317                    0.6663316
## DPM1                      57.36674446                   56.1109712
## SCYL3                      3.08093040                    7.1605879
## C1orf112                   4.84118390                    2.9625904
## FGR                        5.60683479                    6.1761304
##          TCGA.BH.A0HP.01A.12R.A084.07
## TSPAN6                      23.008325
## TNMD                         0.528638
## DPM1                        24.222269
## SCYL3                        6.241249
## C1orf112                     2.429812
## FGR                          1.727848

Filter the Data and Clean up the matrix

#Remove <2 TPM genes; Only retain rows (genes) that show atleast 20% of columns (patients) have TPM values > 2.
TCGA_only_f1=geneExp_T_m_prot
TCGA_only_f1=geneExp_T_m_prot[(rowSums(geneExp_T_m_prot>2)>=as.integer(0.2*ncol(geneExp_T_m_prot))),] 
dim(TCGA_only_f1) 
## [1] 11116  1102
TCGA_only_f1[1:5,1:5]
##          TCGA.BH.A0W3.01A.11R.A109.07 TCGA.A8.A07F.01A.11R.A00Z.07
## TSPAN6                       4.409852                    11.528288
## DPM1                        29.672697                    28.004142
## SCYL3                        6.572338                     6.706920
## C1orf112                     2.876207                     4.814358
## FGR                          1.764365                     1.369976
##          TCGA.GM.A3XL.01A.11R.A22U.07 TCGA.LL.A441.01A.11R.A24H.07
## TSPAN6                       6.272881                    17.824186
## DPM1                        57.366744                    56.110971
## SCYL3                        3.080930                     7.160588
## C1orf112                     4.841184                     2.962590
## FGR                          5.606835                     6.176130
##          TCGA.BH.A0HP.01A.12R.A084.07
## TSPAN6                      23.008325
## DPM1                        24.222269
## SCYL3                        6.241249
## C1orf112                     2.429812
## FGR                          1.727848
#Quantile Normalize data
TCGA_only_f1_prot_QN=normalize.quantiles(TCGA_only_f1)
row.names(TCGA_only_f1_prot_QN)=row.names(TCGA_only_f1)
colnames(TCGA_only_f1_prot_QN)=colnames(TCGA_only_f1)
TCGA_only_f1_prot_QN[1:5,1:5]
##          TCGA.BH.A0W3.01A.11R.A109.07 TCGA.A8.A07F.01A.11R.A00Z.07
## TSPAN6                       3.728963                    10.173951
## DPM1                        28.178291                    27.724906
## SCYL3                        5.591500                     5.609570
## C1orf112                     2.555565                     4.040250
## FGR                          1.683587                     1.153433
##          TCGA.GM.A3XL.01A.11R.A22U.07 TCGA.LL.A441.01A.11R.A24H.07
## TSPAN6                       6.499250                    16.581872
## DPM1                        48.766935                    55.987137
## SCYL3                        3.536875                     5.977064
## C1orf112                     5.203306                     2.197796
## FGR                          5.906148                     5.035973
##          TCGA.BH.A0HP.01A.12R.A084.07
## TSPAN6                      24.185118
## DPM1                        25.407640
## SCYL3                        6.108447
## C1orf112                     2.168783
## FGR                          1.408451
#Log2 transform data
TCGA_only_f1_prot_QN_l2=log2(TCGA_only_f1_prot_QN+1)
TCGA_only_f1_prot_QN_l2[1:5,1:5]
##          TCGA.BH.A0W3.01A.11R.A109.07 TCGA.A8.A07F.01A.11R.A00Z.07
## TSPAN6                       2.241524                     3.482067
## DPM1                         4.866823                     4.844230
## SCYL3                        2.720607                     2.724556
## C1orf112                     1.830079                     2.333495
## FGR                          1.424163                     1.106638
##          TCGA.GM.A3XL.01A.11R.A22U.07 TCGA.LL.A441.01A.11R.A24H.07
## TSPAN6                       2.906746                     4.136017
## DPM1                         5.637116                     5.832564
## SCYL3                        2.181699                     2.802620
## C1orf112                     2.633037                     1.677078
## FGR                          2.787881                     2.593586
##          TCGA.BH.A0HP.01A.12R.A084.07
## TSPAN6                       4.654500
## DPM1                         4.722883
## SCYL3                        2.829534
## C1orf112                     1.663929
## FGR                          1.268105
#Mean-normalize matrix
TCGA_only_f1_prot_QN_l2_m=TCGA_only_f1_prot_QN_l2-apply(TCGA_only_f1_prot_QN_l2,1,mean)
TCGA_only_f1_prot_QN_l2_m[1:5,1:5]
##          TCGA.BH.A0W3.01A.11R.A109.07 TCGA.A8.A07F.01A.11R.A00Z.07
## TSPAN6                     -1.1274625                    0.1130812
## DPM1                       -0.1703116                   -0.1929048
## SCYL3                       0.2234674                    0.2274169
## C1orf112                    0.1588020                    0.6622186
## FGR                        -0.3458319                   -0.6633564
##          TCGA.GM.A3XL.01A.11R.A22U.07 TCGA.LL.A441.01A.11R.A24H.07
## TSPAN6                     -0.4622400                  0.767030468
## DPM1                        0.5999805                  0.795429314
## SCYL3                      -0.3154405                  0.305480522
## C1orf112                    0.9617604                  0.005801225
## FGR                         1.0178868                  0.823591895
##          TCGA.BH.A0HP.01A.12R.A084.07
## TSPAN6                    1.285513333
## DPM1                     -0.314251637
## SCYL3                     0.332394828
## C1orf112                 -0.007348134
## FGR                      -0.501889099
#Get top5K variable genes
var_TCGA_only=apply(TCGA_only_f1_prot_QN_l2, 1, var)
TCGA_only_f1_prot_QN_l2_5K=TCGA_only_f1_prot_QN_l2[order(var_TCGA_only, decreasing=TRUE)[1:5000],]
TCGA_only_f1_prot_QN_l2_m_5K=TCGA_only_f1_prot_QN_l2_m[order(var_TCGA_only, decreasing=TRUE)[1:5000],]

#Most variable genes (top 25)
sort(var_TCGA_only, decreasing=TRUE)[1:25]
##   SCGB2A2   SCGB1D2      TFF1       PIP      CPB1     MUCL1    CLEC3A 
## 15.771835 13.297163 12.522298 11.892873 10.766111 10.256921  9.396407 
##       LTF    CALML5    S100A7      TFF3      AGR3      AGR2     FDCSP 
##  8.803288  8.641087  8.517375  8.410280  7.886337  7.539050  7.339040 
##   CEACAM6    S100A9    CYP4Z1     NPY1R    BMPR1B     KRT14      KRT5 
##  6.995001  6.978111  6.907518  6.894458  6.651942  6.390517  5.884929 
##     GABRP     KRT17     GFRA1     KRT81 
##  5.883652  5.836392  5.752835  5.712550

Visualize the matrix as a heatmap to see how the features (gene expression as rows) define the subtypes of Breast Cancers

Supervised k-means clustering is used here. Again the goal is NOT to determine the number of row or column clusters here.

#Use the top5K features (genes) which normalized by its mean value for each row and visualize as a Heatmap
#Kmeans here
Heatmap(TCGA_only_f1_prot_QN_l2_m_5K,
        col = colorRamp2(c(-2, 0, 2),c("purple", "#EEEEEE", "yellow")),
        show_column_names = F, show_row_names = F,
        cluster_columns = T,cluster_rows = T,
        show_column_dend = T, show_row_dend = T,
        clustering_distance_rows = "pearson", clustering_distance_columns = "pearson",
        show_heatmap_legend =T,
        row_km = 8, column_km = 4
        )

Unsupervised Hierarchical clustering is used here

#Hierarchical and unsupervised clustering
Heatmap(TCGA_only_f1_prot_QN_l2_m_5K,
        col = colorRamp2(c(-2, 0, 2),c("blue", "#EEEEEE", "orange")),
        show_column_names = F, show_row_names = F,
        cluster_columns = T,cluster_rows = T,
        show_column_dend = T, show_row_dend = T,
        clustering_distance_rows = "pearson", clustering_distance_columns = "pearson",
        show_heatmap_legend =T
        )